##### Main analysis #####
#Remove objects
rm(list = ls())


#Load pkgs
library(tidyverse)
library(lubridate)
library(brglm)
library(sandwich)
library(lmtest)
library(margins)
library(texreg)
library(parallel)
library(xtable)

#Load helper functions
source("scripts/helper_functions.R")

#Load main dataset
df_merged_mean <- read_delim("input/df_merged_mean.csv", 
                             ";", escape_double = FALSE, col_types = cols(date_text = col_date(format = "%d/%m/%Y")), 
                             trim_ws = TRUE)

#Load topic names
newnames <- readRDS("input/newnames.rds")

#Exclude topics that correlate strongly (=>0.6) to specific websites (see "descriptives_text.R")

exclude <- c("financial_market","recession","spanish_development_aid",
             "investment_investment","opinion_economy","opinion_investment_state",
             "cyrptomoney","indigenous_groups_diseases",
             "criticism_leftist","weather","opinion_work")
names_fe <- setdiff(newnames,exclude)


#### Run models ####

#Overall, I run 5 * 39  +  1 * 86 brglm models. To adjust for multiple comparisons, I thus set
#the threshold for statistical significance accordingly; a/number of tests = 0.05/281


#Create Dummies for day for off_robust variable
df_merged_mean <- df_merged_mean %>%
  group_by(date_text) %>%
  mutate(date_pos_rob=  ifelse(var(off_rob)>0,date_text,0)) %>%
  ungroup()

#Define i as character value
i = "sanctions"

#Run function that runs brglm models taking each model separately in the equation
#Saved in helper_functions.R

models_4 <- sapply(names_fe, run_model_4_strong)

#Look for convergence status of models
converge <- unlist(models_4[6,])

#Display models with iteration limit reached
limit_reached <- str_remove(names(converge[converge==1]),".message")

#Topic names to report:
topic_names_text_fe <- c("General opinion","Salary/prices","Food policy","Exiled opposition","Court sentences/Lula da Silva",
                         "Economy policy","Child mortality","Outages","Election","Government-Opposition dialog",
                         "Music/entertainment","Government ministers","Protest/shortages",
                         "Colombia border","Oscar Perez/repression","Church/job offer (mixed)","National assembly","Airline (opening/closing)",
                         "Political prisoners","Russia","Sanctions","Military","Shortages healthcare","Regulations",
                         "USA/Korea","Cuba","International Organizations","Exchange Rate","Maduro","Migration crisis",
                         "Petroleum","PDVSA","Resignations","Mining/sport (mixed)","Opposition candidate","Earthquake/accidents",
                         "Shortages","Corruption","Education studies")

#Table B.1: Penalized logistic regression results
htmlreg(models_4[1,1:39],override.se = models_4[2,1:39],override.pvalues = models_4[3,1:39],
        custom.model.names=topic_names_text_fe,
        custom.coef.map = list('scale(get(i))'="Topic",
                               'off_rob_l'="DoS attack (t-1)"),
        stars = c(0.0001779359, 0.05),#Change star indicator to adjust for number of tests (0.05/281)
        file = "output/tables/Table_B1_main_analysis.html",
        caption = paste0("Iteration limit reached for: ",paste(limit_reached,collapse=", "),
                         ". Results for these models are manually removed in the formatted table.")
) #Include information on topics with iteration limit


#Create names vector for output
names_output <- cbind(names_fe,topic_names_text_fe)

names_output <- as.data.frame(names_output)

#Add categories
names_output$category <- ifelse(topic_names_text_fe %in% c("Protest/shortages","Oscar Perez/repression",
                                                           "Political prisoners","Military"),
                                "Protest/repression","Social/economic crisis")
names_output$category  <- ifelse(topic_names_text_fe %in% c("Sanctions","National assembly","International Organizations","Court sentences",
                                                            "Resignations","Corruption","Exiled opposition","Leftist opposition",
                                                            "Election","Opposition candidate"),
                                 "Political legitimacy",names_output$category )
names_output$category <- ifelse(topic_names_text_fe %in% c("Maduro","Government-Opposition dialog",
                                                           "Government ministers","Regulations",
                                                           "Food policy","Economy policy","General opinion"),
                                "Domestic politics",names_output$category )
names_output$category <- ifelse(topic_names_text_fe %in% c("Education studies","Music/entertainment","Crime_deaths",
                                                           "Weather","Earthquake/accidents","Church/job offer (mixed)"
),
"Nonpolitical",names_output$category )

names_output$category  <- ifelse(topic_names_text_fe %in% c("Russia","Colombia border","USA/Korea","Cuba","Court sentences/Lula da Silva"
),"International politics",names_output$category )


#Create vector of positively and sign. related variables for simulation
pos_names <- names(unlist(models_4[5,][(models_4[5,]==1 & models_4[4,]==1)]))

#Leave out models with iteration limit
pos_names <- setdiff(pos_names,limit_reached)

#Save pos_names and names output for later permutation robustness test
saveRDS(pos_names,"output/pos_names.rds")
saveRDS(names_output,"output/names_output.rds")
saveRDS(models_4,"output/model_4.rds")

#Create vector of negatively and sign. related variables for later robustness simulation
neg_names <- names(unlist(models_4[5,][(models_4[5,]==0 & models_4[4,]==1)]))

#Run models separately again to plot marginal effects and order them according to broader topics
#Models and parallelize procedure for main specification can be found in the helper_functions.R file
set.seed(123)
marginal_models_pos <- run_marginal_models(pos_names) 


#Plot marginal effects in broader categories:

day_pos <-plot_simulations(marginal_models_pos,
                           pos_names,names_output)
day_pos
ggsave("output/figures/Figure_2_main_results.pdf",width=16, height=8)




##### Run models for websites + date #####

model_null <- brglm(off_rob~off_rob_l+factor(website_id)+factor(date_pos_rob),data=df_merged_mean,
                    family=binomial(link="logit"),method="brglm.fit",pl=T)
vcov <- vcovCL(model_null, cluster = ~ website_id,type="HC1")
rob <- coeftest(model_null ,vcov = vcov)

set.seed(123)
margins <- margins(model_null,vcov=vcovCL(model_null,cluster = ~ website_id,type="HC1"),vce="simulation", iterations=1000,
                   change="minmax",data=df_merged_mean)
marginal_models_eco <- margins
names <- c(as.character(unique(filter(df_merged_mean,date_pos_rob>0)$date_text)),"LDV",
           sort(unique(df_merged_mean$website_id))[-1])
output_vector <- names(summary(margins)[,2])
category_vector <- c(rep("Date",18),"LDV",rep("Website ID",18))

marginal_frame_pos <- data.frame(Variable = names,
                                 Category = category_vector,
                                 Coefficient = summary(marginal_models_eco)[,2],
                                 SE =  summary(marginal_models_eco)[,3],
                                 Model = "FE-Day") 


interval1 <- -qnorm((1-0.95)/2)  # 95% multiplier
interval2 <- -qnorm((1-(1-0.05/281))/2)   # 95% Bonferroni multiplier

plot_pos <- ggplot(marginal_frame_pos, aes(colour = Model)) +  scale_colour_grey(start = 0, end = 0.8) 
plot_pos <- plot_pos + geom_hline(yintercept = 0, colour = gray(1/2), lty = 2)
plot_pos <- plot_pos +  geom_linerange(aes(x = Variable, ymin = Coefficient - SE*interval1,
                                           ymax = Coefficient + SE*interval1),
                                       lwd = 1.5, position = position_dodge(width = 0.9))
plot_pos <- plot_pos + geom_linerange(aes(x = Variable,  ymin = Coefficient - SE*interval2,
                                          ymax = Coefficient + SE*interval2),
                                      lwd = 1/2, position = position_dodge(width = 0.9))
plot_pos <- plot_pos + geom_pointrange(aes(x = Variable, y = Coefficient, ymin = Coefficient - SE*interval2,
                                           ymax = Coefficient + SE*interval2, shape = Model),
                                       lwd = 1/2, position = position_dodge(width = 0.9)) + scale_shape(solid = TRUE)
plot_pos <- plot_pos + coord_flip() + theme_bw() +theme(axis.text=element_text(size=14,face="bold"),
                                                        axis.title=element_text(size=14,face="bold")) 
plot_pos <- plot_pos + ggtitle("") + xlab("") + ylab("") + scale_size(guide = 'none') + 
  facet_wrap(~ Category, nrow=1, scales = "free_y") +
  theme(legend.text=element_text(size=10,face="bold"),
        strip.text = element_text(size=10),
        legend.position="none",
        panel.spacing.x = unit(10, "mm"))

#Figure B.1: Average Marginal Effects (AME)
plot_pos
ggsave("output/figures/Figure_B1_time_newspaper.pdf",width=16, height=10)


#### Run simulations for negative related topics ####
#Leave out models that did not converge for simulations
neg_names <- setdiff(neg_names,limit_reached)

set.seed(123)
marginal_models_neg <- run_marginal_models(neg_names) 

neg_pos <- plot_simulations(marginal_models_neg,neg_names,names_output)

#Figure D.5
neg_pos
ggsave("output/figures/Figure_D5_negative_topics.pdf",width=8, height=8)

##### Week topic reporting #####

names_fe_week <- names_fe

names_fe_week <-paste0(names_fe_week,"_sum7")
i = "sanctions_sum7"
models_week <- sapply(names_fe_week, run_model_4_strong)

#Print convergence status of models

#Look for convergence status of models
converge <- unlist(models_week[6,])

#Display models with iteration limit reached
limit_reached <- str_remove(names(converge[converge==1]),".message")

#Report results for Table D.4
htmlreg(models_week[1,1:39],override.se = models_week[2,1:39],override.pvalues = models_week[3,1:39],
        custom.model.names=topic_names_text_fe[1:39],
        custom.coef.map = list('scale(get(i))'="Topic",
                               'off_rob_l'="DoS attack (t-1)"),
        stars = c(0.0001779359, 0.05),
        file = "output/tables/Table_D4_week_analysis.html",
        caption = paste0("Iteration limit reached for: ",paste(limit_reached,collapse=", "),
                         ". Results for these models are manually removed in the formatted table.")) 


#Create variables vector for simulation
pos_names_week <- names(unlist(models_week[5,][(models_week[5,]==1 & models_week[4,]==1)
]))

#Create names vector for week output (incl. topics from main specification)
pos_names_week <-unique(c(pos_names_week,paste0(pos_names,"_sum7")))

#Leave out topic that did not converge
pos_names_week <- setdiff(pos_names_week,limit_reached)

set.seed(123)
marginal_models_pos_week <- run_marginal_models(pos_names_week) 

names_output_week <-  names_output
names_output_week$names_fe <-paste0(names_output_week$names_fe,"_sum7")
#need to define that

plot_week <- plot_simulations(marginal_models_pos_week,
                              pos_names_week,names_output_week)

#Figure D.6: Week analysis
plot_week
ggsave("output/figures/Figure_D6_week_analysis.pdf",width=16, height=8)


##Include Traffic data####

df_merged_mean$page_views_per_million <- as.numeric(df_merged_mean$page_views_per_million)
df_merged_mean$page_views_per_million_l <- as.numeric(df_merged_mean$page_views_per_million_l)


orgs <- df_merged_mean %>%
  group_by(website_id)%>%
  summarise_if(is.numeric,tibble::lst(mean),na.rm = TRUE)


#To add in qualitative Table A.1: Assessment of monitored website
orgs$page_views_per_million_mean


#Define i as character value
i = "sanctions"

#Include traffic variable in function
run_model_4_trend <- function(i){
  e1 <- myTryCatch(model <- brglm(off_rob~scale(get(i))+off_rob_l+factor(website_id)+factor(date_pos_rob)+
                                    page_views_per_million_l,data=df_merged_mean,
                                  family=binomial(link="logit"),method="brglm.fit",pl=T))
  vcov <- vcovCL(model, cluster = ~ website_id,type="HC1")
  rob <- coeftest(model ,vcov = vcov)
  se <- rob[,2]
  p_value <- rob[,4]
  sign <- ifelse(p_value[2]<0.05/281,1,0)
  pos <- ifelse(rob[2,1]>0,1,0)
  error <- ifelse(e1[[2]][1]=="Iteration limit reached",1,0)
  return(list(model,se,p_value,sign,pos,error))
}


models_4_trend <- sapply(names_fe, run_model_4_trend)

#Look for convergence status of models
converge <- unlist(models_4_trend[6,])

#Display models with iteration limit reached
limit_reached <- str_remove(names(converge[converge==1]),".message")

#Generate vector for AME simulations
pos_names_trend <- names(unlist(models_4_trend[5,][
  (models_4_trend[5,]==1 & models_4_trend[4,]==1)
]))

pos_names_trend <- unique(c(pos_names_trend,pos_names))

#Leave out models with iteration limit
pos_names_trend <- setdiff(pos_names_trend,limit_reached)


#Report Table D.1
htmlreg(models_4_trend[1,1:39],override.se = models_4_trend[2,1:39],override.pvalues = models_4_trend[3,1:39],
        custom.model.names=topic_names_text_fe[1:39],
        custom.coef.map = list('scale(get(i))'="Topic",
                               'page_views_per_million_l' = "Page views in million (t-1)"
        ),
        stars = c(0.0001779359, 0.05),
        file = "output/tables/Table_D1_analysis_trend.html",
        caption = paste0("Iteration limit reached for: ",paste(limit_reached,collapse=", "),
                         ". Results for these models are manually removed in the formatted table.")
) #Include information on topics with iteration limit


#Run models separately again to plot marginal effects and order them according to broader topics

#Rewrite functions for AMEs to include traffic variable
marginal_model_sim_trend <- function(variable_name){
  library(brglm)
  library(margins)
  library(sandwich)
  library(tidyverse)
  set.seed(123)
  formula <- as.formula(paste0("off_rob~",paste0(variable_name,"+off_rob_l+factor(website_id)+factor(date_pos_rob)+
                                    page_views_per_million_l")))
  m <-  brglm(formula,data=df_merged_mean,
              family=binomial(link="logit"),method="brglm.fit",pl=T)
  margins <- margins(m,vcov=vcovCL(m,cluster = ~ website_id,type="HC1"),vce="simulation", iterations=1000,
                     change="minmax",data=df_merged_mean)
  return(list(margins))
}

run_marginal_models_trend <- function(names){
  # Run marignal models
  start.time <- Sys.time()
  library(parallel)
  
  # Calculate the number of cores
  no_cores <- detectCores(all.tests = FALSE, logical = TRUE)
  no_cores <- no_cores-1
  
  # Initiate cluster
  cl <- makeCluster(no_cores)
  
  clusterExport(cl, list("df_merged_mean","marginal_model_sim_trend"))
  
  marginal_models <- parLapply(cl, 
                               c(names), 
                               marginal_model_sim_trend)
  
  stopCluster(cl)
  end.time <- Sys.time()
  return(marginal_models)
}


marginal_models_trend_pos <- run_marginal_models_trend(pos_names_trend) 

plot_pos_trend <- plot_simulations(marginal_models_trend_pos,
                                   pos_names_trend,names_output)

#Figure D.2: AMEs w/ traffic
plot_pos_trend
ggsave("output/figures/Figure_D2_analysis_traffic.pdf",width=16, height=8)


##### Max specification (day) ###### 

#Load dataset with maximal specification
df_merged_max <- read_delim("input/df_merged_max.csv", 
                            ";", escape_double = FALSE, col_types = cols(date_text = col_date(format = "%d/%m/%Y")), 
                            trim_ws = TRUE)

df_merged_max <- df_merged_max %>%
  group_by(date_text) %>%
  mutate(date_pos_rob=  ifelse(var(off_rob)>0,date_text,0)) %>%
  ungroup()

#Define i as character value
i = "sanctions"



#Rewrite BRGLM functions to include dataset that is aggregated to the maximal value
run_model_4_max <- function(i){
  e1 <- myTryCatch(model <- brglm(off_rob~scale(get(i))+factor(website_id)+factor(date_pos_rob)+off_rob_l,data=df_merged_max,
                                  family=binomial(link="logit"),method="brglm.fit",pl=T))
  vcov <- vcovCL(model, cluster = ~ website_id,type="HC1")
  rob <- coeftest(model ,vcov = vcov)
  se <- rob[,2]
  p_value <- rob[,4]
  sign <- ifelse(p_value[2]<0.05/281,1,0)
  pos <- ifelse(rob[2,1]>0,1,0)
  error <- ifelse(e1[[2]][1]=="Iteration limit reached",1,0)
  return(list(model,se,p_value,sign,pos,error))
}

models_4_max <- sapply(names_fe, run_model_4_max)

#Look for convergence status of models
converge <- unlist(models_4_max[6,])

#Display models with iteration limit reached
limit_reached <- str_remove(names(converge[converge==1]),".message")


#Report model Table D.3

htmlreg(models_4_max[1,1:39],override.se = models_4_max[2,1:39],override.pvalues = models_4_max[3,1:39],
        custom.model.names=topic_names_text_fe[1:39],
        custom.coef.map = list('scale(get(i))'="Topic",
                               'off_rob_l'="DoS attack (t-1)"),
        stars = c(0.0001779359, 0.05),
        file = "output/tables/Table_D3_max_analysis.html",
        caption = paste0("Iteration limit reached for: ",paste(limit_reached,collapse=", "),
                         ". Results for these models are manually removed in the formatted table.")
)

#Marginal effect for sign. relationships
pos_names_max <- names(unlist(models_4_max [5,][
  (models_4_max [5,]==1 & models_4_max [4,]==1)
]))

pos_names_max <- unique(c(pos_names_max,pos_names))

#Rewrite again AME simulation functions
marginal_model_sim_max <- function(variable_name){
  library(brglm)
  library(margins)
  library(sandwich)
  library(tidyverse)
  set.seed(123)
  formula <- as.formula(paste0("off_rob~",paste0(variable_name,"+factor(website_id)+factor(date_pos_rob)+off_rob_l")))
  m <-  brglm(formula,data=df_merged_max,
              family=binomial(link="logit"),method="brglm.fit",pl=T)
  margins <- margins(m,vcov=vcovCL(m,cluster = ~ website_id,type="HC1"),vce="simulation", iterations=1000,
                     change="minmax",data=df_merged_max)
  return(list(margins))
}

run_marginal_models_max <- function(names){
  # Run marignal models
  start.time <- Sys.time()
  library(parallel)
  
  # Calculate the number of cores
  no_cores <- detectCores(all.tests = FALSE, logical = TRUE)
  no_cores <- no_cores-1
  
  # Initiate cluster
  cl <- makeCluster(no_cores)
  
  clusterExport(cl, list("df_merged_max","marginal_model_sim_max"))
  
  marginal_models <- parLapply(cl, 
                               c(names), 
                               marginal_model_sim_max)
  
  stopCluster(cl)
  end.time <- Sys.time()
  return(marginal_models)
}

set.seed(123)
marginal_models_pos_max<- run_marginal_models_max(pos_names_max) 


plot_pos_max <- plot_simulations(marginal_models_pos_max,pos_names_max,names_output)

#Figure D.4
plot_pos_max
ggsave("output/figures/Figure_D4_analysis_max.pdf",width=16, height=8)



##### Shorter attacks #####
#Create names vector for output
i = "sanctions"

df_merged_mean <- df_merged_mean %>%
  group_by(date_text) %>%
  mutate(date_pos=  ifelse(var(off)>0,date_text,0)) %>%
  ungroup()

#Rewrite again function
run_model_4 <- function(i){
  e1 <- myTryCatch(model <- brglm(off~scale(get(i))+off_l+factor(website_id)+
                                    factor(date_pos),data=df_merged_mean,
                                  family=binomial(link="logit"),method="brglm.fit",pl=T))
  vcov <- vcovCL(model, cluster = ~ website_id,type="HC1")
  rob <- coeftest(model ,vcov = vcov)
  se <- rob[,2]
  p_value <- rob[,4]
  sign <- ifelse(p_value[2]<0.05/281,1,0)
  pos <- ifelse(rob[2,1]>0,1,0)
  error <- ifelse(e1[[2]][1]=="Iteration limit reached",1,0) #Save whether there has been an error
  
  return(list(model,se,p_value,sign,pos,error))
}


models_4_short <- sapply(names_fe, run_model_4)

#Look for convergence status of models
converge <- unlist(models_4_short[6,])

#Display models with iteration limit reached
limit_reached <- str_remove(names(converge[converge==1]),".message")


#Report model Table D.2
htmlreg(models_4_short[1,1:39],override.se = models_4_short[2,1:39],override.pvalues = models_4_short[3,1:39],
        custom.model.names=topic_names_text_fe,
        custom.coef.map = list('scale(get(i))'="Topic",
                               'off_l'="DoS attack (t-1)"),
        stars = c(0.0001779359, 0.05),#Change star indicator to adjust for number of tests (0.05/281)
        file = "output/tables/Table_D2_short_duration.html",
        caption = paste0("Iteration limit reached for: ",paste(limit_reached,collapse=", "),
                         ". Results for these models are manually removed in the formatted table.")
) #Include information on topics with iteration limit


#Retrieve pos. and sign. relationships
pos_names_short <- names(unlist(models_4_short[5,][(models_4_short[5,]==1 & models_4_short[4,]==1)
]))
pos_names_short <- unique(c(pos_names_short,pos_names))

#Leave out models with iteration limit
pos_names_short <- setdiff(pos_names_short,limit_reached)


#Rewrite AME simulations
marginal_model_sim_short <- function(variable_name){
  library(brglm)
  library(margins)
  library(sandwich)
  library(tidyverse)
  set.seed(123)
  formula <- as.formula(paste0("off~",paste0(variable_name,"+factor(website_id)+factor(date_pos)+off_l")))
  m <-  brglm(formula,data=df_merged_mean,
              family=binomial(link="logit"),method="brglm.fit",pl=T)
  margins <- margins(m,vcov=vcovCL(m,cluster = ~ website_id,type="HC1"),vce="simulation", iterations=1000,
                     change="minmax",data=df_merged_mean)
  return(list(margins))
}

run_marginal_models_short <- function(names){
  # Run marignal models
  start.time <- Sys.time()
  library(parallel)
  
  # Calculate the number of cores
  no_cores <- detectCores(all.tests = FALSE, logical = TRUE)
  no_cores <- no_cores-1
  
  # Initiate cluster
  cl <- makeCluster(no_cores)
  
  clusterExport(cl, list("df_merged_mean","marginal_model_sim_short"))
  
  marginal_models <- parLapply(cl, 
                               c(names), 
                               marginal_model_sim_short)
  
  stopCluster(cl)
  end.time <- Sys.time()
  return(marginal_models)
}

set.seed(123)
marginal_models_pos_short <- run_marginal_models_short(pos_names_short) 


plot_pos_short <- plot_simulations(marginal_models_pos_short,pos_names_short,names_output)

#Figure D.3
plot_pos_short
ggsave("output/figures/Figure_D3_analysis_short.pdf",width=16, height=8)



##### Comparison Cloudflare vs. Non-Cloudflare protected on attack day#####


df_merged_mean_attacked <- filter(df_merged_mean,date_pos_rob!=0)

df_merged_mean_attacked <- mutate(df_merged_mean_attacked,cloudflare=ifelse(website_id %in%
                                                                              c(32,41,42,45,50,51),"cloud","non-cloud"))


agg_comparison_cloud <- function(topic){
  test <- t.test(filter(df_merged_mean_attacked,cloudflare=="cloud" & off_rob==1)[,topic],filter(df_merged_mean_attacked,cloudflare=="non-cloud" & off_rob==1)[,topic])
  output <- c(test$estimate,test$statistic,test$p.value)
  names(output) <- c("Cloudflare (mean)","Non-Cloudflare (mean)","t-value","p-value")
  return(output)
}

t_tests_cloud <- sapply(names_fe,agg_comparison_cloud)


t_tests_cloud <- t(t_tests_cloud)
rownames(t_tests_cloud) <- topic_names_text_fe

#Table D.6: T-tests between topic distributions between Cloudflare protected vs. unprotected servers on attack days.")
print(xtable(t_tests_cloud,digits = 2),type="html",
      file="output/tables/Table_D6_Comparison.html")